rm(list=ls())
library(here)
## Warning: package 'here' was built under R version 4.2.2
## here() starts at C:/Users/pprasa6/Documents/GitHub/globalmix-mozambique
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
participants <- readRDS(paste0(here(),"/data/clean/participant_data_aim1.RDS"))
contacts <- readRDS(paste0(here(),"/data/clean/contact_data_aim1.RDS"))

## Subset contacts to the IDs in participant list only
contacts <- contacts %>%
              dplyr::filter(rec_id %in% unlist(participants$rec_id))

# Filter contact relationship == self (you cannot have a contact with yourself)
contacts <- contacts %>%
  dplyr::filter(hh_member_relationship != "Self")



# relationship
contacts$hh_membership <- factor(contacts$hh_membership,
                                          levels = c("Member", "Non-member"))


# recategorize contact locations
contacts <- contacts %>%
              mutate(cnt_home = ifelse(location_contact___0==1, 1,0),
                     cnt_school = ifelse(location_contact___2==1, 1,0),
                     cnt_work = ifelse(location_contact___3==1, 1,0),
                     cnt_otherplace = ifelse(location_contact___1==1 | 
                                               location_contact___4==1 | 
                                               location_contact___5==1 | 
                                               location_contact___6==1 | 
                                               location_contact___7==1 |
                                               location_contact___8==1 | 
                                               location_contact___9==1 |
                                               location_contact___10==1 | 
                                               location_contact___11==1,1,0))
# masking
contacts <- contacts %>%
  mutate(contact_mask2 = case_when(contact_mask == "Yes, for the entire encounter" ~ "Yes",
                                   contact_mask == "Yes, during parts of encounter" ~ "Yes",
                                   contact_mask == "No mask was worn during the encounter" ~ "No",
                                   TRUE ~ "Can't recall"))
contacts$contact_mask2 <- factor(contacts$contact_mask2, levels = c("Yes", "No",
                                                                    "Can't recall"))

contacts$cnt_home <- factor(contacts$cnt_home, levels = c(1, 0),
                            labels = c("Yes", "No"))
contacts$cnt_work <- factor(contacts$cnt_work, levels = c(1, 0),
                            labels = c("Yes", "No"))
contacts$cnt_school <- factor(contacts$cnt_school, levels = c(1, 0),
                            labels = c("Yes", "No"))
contacts$cnt_otherplace <- factor(contacts$cnt_otherplace, levels = c(1, 0),
                            labels = c("Yes", "No"))


df_contact <- contacts %>%
  dplyr::select(-study_site) %>%
  left_join(participants, by=("rec_id"))

df_contact_d1 <- df_contact %>%
  dplyr::filter(study_day==1)

day1_num_contacts <- df_contact_d1 %>%
  dplyr::group_by(rec_id, study_site) %>%
  dplyr::summarize(num_contacts = n()) %>%
  dplyr::select(rec_id, num_contacts)
## `summarise()` has grouped output by 'rec_id'. You can override using the
## `.groups` argument.
df_contact_d1 <- left_join(df_contact_d1, day1_num_contacts, by="rec_id" )

df_contact_d2 <- df_contact %>%
  dplyr::filter(study_day==2)

day2_num_contacts <- df_contact_d2 %>%
  dplyr::group_by(rec_id, study_site) %>%
  dplyr::summarize(num_contacts = n()) %>%
  dplyr::select(rec_id, num_contacts)
## `summarise()` has grouped output by 'rec_id'. You can override using the
## `.groups` argument.
df_contact_d2 <- left_join(df_contact_d2, day2_num_contacts, by="rec_id" )

both_num_contacts <- df_contact %>%
  dplyr::group_by(rec_id, study_day, study_site) %>%
  dplyr::summarize(num_contacts = n()) %>%
  dplyr::select(rec_id, study_day, num_contacts)
## `summarise()` has grouped output by 'rec_id', 'study_day'. You can override
## using the `.groups` argument.
df_contact <- left_join(df_contact, both_num_contacts, by=c("rec_id", "study_day") )

# rural contacts
df_contact_rural <- df_contact %>%
  dplyr::filter(study_site == "Rural")

## specify participant characteristic stratification for cross tabs/analysis
variables <- data.frame(var=c("participant_sex","participant_age", "mask_uselb", 
                              "hh_membership","occupation", "weekday", "enrolled_school",
                              "touch_contact", "where_contact", "cnt_home", "cnt_school",
                              "cnt_work", "cnt_otherplace", "frequency_contact",
                              "known_contact", "contact_mask2", "ari_symptom", "age_symptom"),
                        # "hh_member_relationship",
                        name=c("Sex", "Age", "Do you wear a mask?",
                               "Household membership", "Occupation", "Weekday/Weekend",
                               "Enrolled in school", "Did you touch?", "Contact location",
                               "Contact at home", "Contact at work", 
                               "Contact at school", "Contact in other locations",
                               "Frequency of contact", "Do you know the contact?", 
                               "Contact wearing mask", "ARI symptoms", "AGE symptoms")) %>%
                        #  "Relationship to household member",
  
  mutate(var = as.character(var),
         name = as.character(name))

list <- list(0)

for (i in 1:nrow(variables)){
  x <- df_contact_rural[,variables$var[[i]]] 
  # include another variable n as the number of participants per strata
  variables$var[[i]]
  
# Number and proportion of contacts in each strata
  
  t0 <- as.data.frame(cbind(table(x), # Total 
                            round(prop.table(table(x))*100, digits=0) # Proportion
                            )) # number of participants
 
  colnames(t0)[1:2] <- c("Total","Col") 
  Tot <- rep("",5)
  
# Median contacts for day 1
  t1 <- df_contact_d1 %>%
    dplyr::filter(study_site == "Rural") %>%
    # since this is total contacts per person, keep only 1 distinct record
    distinct(rec_id, .keep_all = TRUE) %>% 
    dplyr::group_by(.dots = variables$var[[i]]) %>% 
    # na.omit() %>%
    do(data.frame(t(quantile(.$num_contacts, na.rm=TRUE, probs=c(0.25,0.5,0.75))))) # Median and IQR      
  t1$med_contact <- as.character(paste(t1$X50.," (",t1$X25.,"-",t1$X75.,")",sep="")) # Formatting for export
  
  # Median contacts for day 2
  t2 <- df_contact_d2 %>%
    dplyr::filter(study_site == "Rural") %>%
    # since this is total contacts per person, keep only 1 distinct record
    distinct(rec_id, .keep_all = TRUE) %>% 
    dplyr::group_by(.dots = variables$var[[i]]) %>%
    # na.omit() %>%
    do(data.frame(t(quantile(.$num_contacts, na.rm=TRUE, probs=c(0.25,0.5,0.75))))) # Median and IQR      
  t2$med_contact <- as.character(paste(t2$X50.," (",t2$X25.,"-",t2$X75.,")",sep="")) # Formatting for export
  
  # Bind columns together and select relevant columns
  t0<-qpcR:::cbind.na(t0, t1$med_contact, t2$med_contact) 
  
  # convert to numerical and round off?
  
  t0<- t0[,c(1,2,3,4)]
  
  # rename total column
  t0$Total <- paste(t0$Total," (","", t0$Col,")",sep="")
  t0 <- t0[,c(1,3,4)]
  t0[,1]<-as.character(t0[,1])
  t0[,2]<-as.character(t0[,2])

  t0<-rbind(Tot, t0)
  
  rownames(t0)[1]<-variables$name[[i]]
  list[[i]] <- t0
}
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <]8;;https://github.com/tidyverse/dplyr/issueshttps://github.com/tidyverse/dplyr/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1
# format table
res_restruct<- function(res){
  res1 <- lapply(res, as.data.frame)
  res1 <- do.call(rbind, res1)
  return(res1)
}

table3 <- res_restruct(list)
colnames(table3) <- c("Total (%)", "Day 1", "Day 2")

table3 <- knitr::kable(table3, digits = 0, align = "r") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

table3
Total (%) Day 1 Day 2
Sex
Female 5673 (48) 8 (4-11) 7 (4-11)
Male 6109 (52) 8 (5-12) 8 (5-11)
Age
<6mo 453 (4) 3 (1-6) 3 (1-6)
6-11mo 1013 (9) 7 (3-9) 6 (3.5-9)
1-4y 1129 (10) 5 (2-8.5) 5 (2-8.25)
5-9y 1268 (11) 8.5 (6-12) 9 (6-13)
10-14y 1266 (11) 10 (8-14) 10 (7-12)
15-19y 1532 (13) 12 (8-16.25) 11 (7-14)
20-29y 1292 (11) 9 (7-12.25) 9 (6-12)
30-39y 1193 (10) 8 (6-11) 8 (6-12)
40-59y 1804 (15) 10 (6-13) 8 (6-11)
60+y 832 (7) 5.5 (4-10) 6 (3-9)
Do you wear a mask?
Yes 8585 (73) 9 (6-12) 8 (6-12)
No 3197 (27) 5 (3-9) 5 (3-9)
No response 0 (0) NA NA
Household membership
Member 3965 (34) 9 (6-12) 8 (6-12)
Non-member 7817 (66) 3 (1-5) 2 (1-5.5)
Occupation
Child 1466 (14) 5 (2-8) 5 (2-8)
Unemployed 1569 (16) 7 (5-12) 7 (4-10)
Student 3290 (33) 10 (8-14) 9 (7-13)
Homemaker 137 (1) 9 (7-9) 6 (5-10)
Casual laboror 398 (4) 8 (6-11.5) 8 (6-10.75)
Farmer 1231 (12) 9 (6-12.5) 7 (6-10)
Fisherman 73 (1) 17.5 (14.25-20.75) 19 (14.5-23.5)
Business person 146 (1) 10 (7.5-10.5) 12 (7-14.5)
Office worker 524 (5) 8 (6-10.5) 8 (5-11)
Retired 58 (1) 5 (4-5) 7 (4-8)
Other 1226 (12) 10 (7-12.75) 9 (7-12.75)
11 NA (NA) 6 (3-9) 6 (2-9)
Weekday/Weekend
Weekday 8232 (70) 8 (5-12) 8 (5-11)
Weekend 3550 (30) 7 (3-10) 7 (4-10)
Enrolled in school
Yes1 3603 (32) 9 (7-13) 9 (6-13)
No1 7686 (68) 7 (4-10) 7 (3.5-10)
111 NA (NA) 8.5 (8-13.5) 10 (6.25-14)
Did you touch?
Yes2 7493 (64) 8 (4-11) 8 (4-11)
No2 4275 (36) 9 (5-11) 7 (5-10)
I don’t remember 12 (0) 14 (13-15) 5 (5-5)
112 NA (NA) 8 (8-8) NA
Contact location
Indoors 734 (6) 9.5 (7-10) 10 (7.5-16)
Outdoors 5790 (49) 7 (4-11) 6 (2-10)
Both 5257 (45) 8 (5-11) 8 (5-11)
Contact at home
Yes3 7297 (62) 8 (5-12) 8 (5-11)
No3 4485 (38) 6 (3-8) 6 (4-9.5)
Contact at work
Yes4 524 (4) 10.5 (7.75-14.75) 8 (7-12)
No4 11258 (96) 8 (5-11) 7 (4-11)
Contact at school
Yes5 389 (3) 8 (5-9.5) 10 (10-13)
No5 11393 (97) 8 (5-11) 7 (4-11)
Contact in other locations
Yes6 5774 (49) 8 (3-11) 7 (3-10)
No6 6008 (51) 8 (5-12) 8 (5-12)
Frequency of contact
Never met before 399 (3) 8.5 (7-10.75) 5.5 (4.75-7)
Rarely 452 (4) 6 (4.25-10) 4.5 (4-8.75)
Daily or almost daily 8953 (76) 8 (5-11) 8 (5-11)
1-3 times per week 1429 (12) 4 (4-7) 9 (2.5-11.5)
Once every 2 weeks 319 (3) 1 (1-1) 4 (2.5-5.5)
Once per month 181 (2) 7 (7-7) 7 (6-8)
Once every 3 months 47 (0) 4.5 (2.75-6.25) 2.5 (2.25-2.75)
Do you know the contact?
Never met before1 326 (8) 7 (7-7) 5 (4-10)
<1 yr 736 (17) 7.5 (4.25-11) 7 (4-9)
1-2 yrs 422 (10) 8 (6-14) 9 (6-15)
3-5 yrs 732 (17) 9 (7-11) 8 (6.5-10)
6-10 yrs 769 (18) 8 (6-12) 8 (5-12)
>10 yrs 1303 (30) 9 (7-13) 9 (6-12)
113 NA (NA) 3 (1-5) 2 (1-5)
Contact wearing mask
Yes7 1930 (16) 9 (6-13) 8 (5-13)
No7 9796 (83) 8 (5-11) 7 (4-11)
Can’t recall 56 (0) 10 (10-10) 7.5 (6.25-8.75)
ARI symptoms
No symptom 9583 (81) 8 (5-11) 7 (5-10)
>1 symptom 2199 (19) 8 (4-12.5) 8 (3-12.5)
AGE symptoms
Yes8 311 (3) 9 (8-13) 9 (8-16)
No8 11471 (97) 8 (5-11) 7 (4-11)
# urban contacts
df_contact_urban <- df_contact %>%
  dplyr::filter(study_site == "Urban")

list <- list(0)

for (i in 1:nrow(variables)){
  x <- df_contact_urban[,variables$var[[i]]] 
  # include another variable n as the number of participants per strata
  variables$var[[i]]
  
# Number and proportion of contacts in each strata
  
  t0 <- as.data.frame(cbind(table(x), # Total 
                            round(prop.table(table(x))*100, digits=0) # Proportion
                            )) # number of participants
 
  colnames(t0)[1:2] <- c("Total","Col") 
  Tot <- rep("",5)
  
# Median contacts for day 1
  t1 <- df_contact_d1 %>%
    dplyr::filter(study_site == "Urban") %>%
    # since this is total contacts per person, keep only 1 distinct record
    distinct(rec_id, .keep_all = TRUE) %>% 
    dplyr::group_by(.dots = variables$var[[i]]) %>% 
    # na.omit() %>%
    do(data.frame(t(quantile(.$num_contacts, na.rm=TRUE, probs=c(0.25,0.5,0.75))))) # Median and IQR      
  t1$med_contact <- as.character(paste(t1$X50.," (",t1$X25.,"-",t1$X75.,")",sep="")) # Formatting for export
  
  # Median contacts for day 2
  t2 <- df_contact_d2 %>%
    dplyr::filter(study_site == "Urban") %>%
    # since this is total contacts per person, keep only 1 distinct record
    distinct(rec_id, .keep_all = TRUE) %>% 
    dplyr::group_by(.dots = variables$var[[i]]) %>%
    # na.omit() %>%
    do(data.frame(t(quantile(.$num_contacts, na.rm=TRUE, probs=c(0.25,0.5,0.75))))) # Median and IQR      
  t2$med_contact <- as.character(paste(t2$X50.," (",t2$X25.,"-",t2$X75.,")",sep="")) # Formatting for export
  
  # Bind columns together and select relevant columns
  t0<-qpcR:::cbind.na(t0, t1$med_contact, t2$med_contact) 
  
  # convert to numerical and round off?
  
  t0<- t0[,c(1,2,3,4)]
  
  # rename total column
  t0$Total <- paste(t0$Total," (","", t0$Col,")",sep="")
  t0 <- t0[,c(1,3,4)]
  t0[,1]<-as.character(t0[,1])
  t0[,2]<-as.character(t0[,2])

  t0<-rbind(Tot, t0)
  
  rownames(t0)[1]<-variables$name[[i]]
  list[[i]] <- t0
}
## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1

## Warning in rbind(deparse.level, ...): number of columns of result, 3, is not a
## multiple of vector length 5 of arg 1
# format table
res_restruct<- function(res){
  res1 <- lapply(res, as.data.frame)
  res1 <- do.call(rbind, res1)
  return(res1)
}

table3 <- res_restruct(list)
colnames(table3) <- c("Total (%)", "Day 1", "Day 2")

table3 <- knitr::kable(table3, digits = 0, align = "r") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

table3
Total (%) Day 1 Day 2
Sex
Female 3827 (47) 6 (3-8) 5 (3-8)
Male 4310 (53) 6 (4-9) 6 (4-8)
11 NA (NA) 5 (5-5) 5 (5-5)
Age
<6mo 477 (6) 4 (2-5) 3 (2.25-5)
6-11mo 467 (6) 4 (2-7) 3.5 (2-6)
1-4y 611 (7) 4 (2-6.75) 3 (2-7)
5-9y 779 (10) 6 (4-9) 6 (4-8.75)
10-14y 1055 (13) 8 (6-10) 7.5 (5.75-9)
15-19y 1042 (13) 9 (6-12) 8 (5-10)
20-29y 743 (9) 6 (4-7.25) 6 (4.75-8)
30-39y 805 (10) 6 (4-9) 6 (5-8)
40-59y 1497 (18) 6 (4-8) 5 (4-7)
60+y 671 (8) 5 (3-7) 5 (3-8)
Do you wear a mask?
Yes 6677 (82) 6 (4-9) 6 (4-9)
No 1462 (18) 4 (2-7) 4 (2-6)
No response 8 (0) 4 (4-4) 4 (4-4)
Household membership
Member 3724 (46) 6 (4-9) 6 (4-8)
Non-member 4423 (54) 2 (1-5) 2 (1-5)
Occupation
Child 944 (13) 4 (2-6) 3 (2-5)
Unemployed 833 (11) 6 (4-9) 6 (3-9)
Student 2654 (36) 7 (5-10) 7 (5-9)
Homemaker 305 (4) 6 (4.75-8) 6 (5-7.25)
Casual laboror 660 (9) 5 (4-7) 5 (4-7.25)
Farmer 52 (1) 3.5 (3-4) 3.5 (1.5-8.5)
Fisherman 0 (0) 6 (4-8) 6 (4-8)
Business person 727 (10) 6 (4-8) 6 (4-8)
Office worker 737 (10) 4 (3-5.5) 5 (2.5-6)
Retired 150 (2) 9 (8.25-10) 9 (6-10)
Other 229 (3) 5 (2-7) 4 (2-7)
Weekday/Weekend
Weekday 5980 (74) 6 (4-8) 6 (3-8)
Weekend 2138 (26) 5 (3-8) 5 (3-7)
111 NA (NA) 12.5 (9.25-15.75) 9.5 (7.25-11.75)
Enrolled in school
Yes1 2926 (37) 7 (5-9) 7 (5-9)
No1 4960 (63) 5 (3-8) 5 (3-7.25)
112 NA (NA) 6 (4-6) 5 (3-9)
Did you touch?
Yes2 5930 (73) 5 (3-8) 5 (3-8)
No2 2198 (27) 7 (5-8) 6 (4.75-9)
I don’t remember 18 (0) 7 (6.5-7.5) 3 (2.5-3.5)
Contact location
Indoors 926 (11) 7 (4-9) 6 (4-8)
Outdoors 2485 (31) 5 (3-8) 4 (2.25-7.75)
Both 4736 (58) 6 (3.75-8) 5 (3-8)
Contact at home
Yes3 5621 (69) 6 (4-8) 5 (3-8)
No3 2526 (31) 5 (3-7.5) 5 (3-7.25)
Contact at work
Yes4 357 (4) 9 (9-9) 4 (3-6.75)
No4 7790 (96) 6 (4-8) 5 (3-8)
Contact at school
Yes5 260 (3) 4.5 (2-8.75) 4 (3-4)
No5 7887 (97) 6 (4-8) 5 (3-8)
Contact in other locations
Yes6 3051 (37) 6 (4-8.5) 5 (3-8)
No6 5096 (63) 6 (4-8) 5.5 (3-8)
Frequency of contact
Never met before 276 (3) 5 (1-6) 7 (5-8)
Rarely 302 (4) 5 (4-9) 4 (3.5-5.75)
Daily or almost daily 6413 (79) 6 (4-8) 5 (3-8)
1-3 times per week 930 (11) 4 (2.5-6.5) 4 (3-7)
Once every 2 weeks 149 (2) 6 (4-6) 6 (2.25-7.5)
Once per month 58 (1) 4.5 (3.75-5.25) 4.5 (3.75-5.25)
Once every 3 months 18 (0) NA 3 (3-3)
Do you know the contact?
Never met before1 272 (7) 5 (1-6) 5 (5-7)
<1 yr 587 (15) 4 (3-6.75) 4 (3-6)
1-2 yrs 304 (8) 6 (4-9) 6 (4-9)
3-5 yrs 359 (9) 4 (3-6) 5 (3-8)
6-10 yrs 699 (17) 6 (5-9.25) 6 (4-8)
>10 yrs 1775 (44) 7 (4-9) 6 (5-9)
113 NA (NA) 2 (1-5) 2 (1-4)
Contact wearing mask
Yes7 2170 (27) 6 (4-8) 5 (4-7)
No7 5958 (73) 6 (4-9) 6 (3-8)
Can’t recall 19 (0) 5 (5-5) 8 (7.5-8.5)
ARI symptoms
No symptom 6685 (82) 6 (3-8) 5 (3-8)
>1 symptom 1462 (18) 6 (4-8) 6 (4-8)
AGE symptoms
Yes8 124 (2) 6 (4-8) 6 (5-6.75)
No8 8023 (98) 6 (4-8) 5 (3-8)
contacts_age <- df_contact_d1 %>%
  dplyr::group_by(rec_id, study_site, participant_age) %>%   
  # group by id and count number of df_contact_d1
  dplyr::summarize(num_contacts = n())
## `summarise()` has grouped output by 'rec_id', 'study_site'. You can override
## using the `.groups` argument.
# contacts by site
contacts_site_d1 <- df_contact_d1 %>%
  dplyr::group_by(rec_id, study_site) %>%
  dplyr::summarize(num_contacts = n())
## `summarise()` has grouped output by 'rec_id'. You can override using the
## `.groups` argument.
median_contacts_site_d1 <- contacts_site_d1 %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(q25 = quantile(num_contacts, probs = 0.25),
                   q50 = quantile(num_contacts, probs = 0.5),
                   q75 = quantile(num_contacts, probs = 0.75))

mean_contacts_site_d1 <- contacts_site_d1 %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(mean = round(mean(num_contacts),1))

ci_contacts_rural_d1 <- contacts_site_d1 %>%
  dplyr::filter(study_site == "Rural")
ci_contacts_rural_d1 <- lm(num_contacts ~ 1, ci_contacts_rural_d1)
ci_contacts_rural_d1 <- as.data.frame(round(confint(ci_contacts_rural_d1),1))

ci_contacts_urban_d1 <- contacts_site_d1 %>%
  dplyr::filter(study_site == "Urban")
ci_contacts_urban_d1 <- lm(num_contacts ~ 1, ci_contacts_urban_d1)
ci_contacts_urban_d1 <- as.data.frame(round(confint(ci_contacts_urban_d1),1))

# title
title_rural_d1 <- list(
  text = "Rural",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)

title_urban_d1 <- list(
  text = "Urban",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)
hist_rural_d1 <- contacts_site_d1 %>%
  dplyr::filter(study_site == "Rural") %>%
  plotly::plot_ly(x=~num_contacts, colors="#a6611a") %>%
  plotly::add_lines(y = range(0:100),
            x = mean_contacts_site_d1$mean[1],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:100),
            x = median_contacts_site_d1$q50[1],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  plotly::add_annotations(x = mean_contacts_site_d1$mean[1], 
                  y = 90,
                  text = paste(mean_contacts_site_d1$mean[1], " (95% CI ",
                               ci_contacts_rural_d1$`2.5 %`,"-",
                               ci_contacts_rural_d1$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site_d1$q50[1], 
                  y = 90,
                  text = paste(median_contacts_site_d1$q50[1], " (95% CI ",
                               median_contacts_site_d1$q25[1],"-",
                               median_contacts_site_d1$q75[1],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  plotly::layout(annotations = title_rural_d1,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Rural")
# hist_rural

hist_urban_d1 <- contacts_site_d1 %>%
  dplyr::filter(study_site == "Urban") %>%
  plotly::plot_ly(x=~num_contacts, colors="#018571") %>%
  add_lines(y = range(0:100),
            x = mean_contacts_site_d1$mean[2],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:100),
            x = median_contacts_site_d1$q50[2],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  add_annotations(x = mean_contacts_site_d1$mean[2], 
                  y = 90,
                  text = paste(mean_contacts_site_d1$mean[2], " (95% CI ",
                               ci_contacts_urban_d1$`2.5 %`,"-",
                               ci_contacts_urban_d1$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site_d1$q50[2], 
                  y = 90,
                  text = paste(median_contacts_site_d1$q50[2], " (95% CI ",
                               median_contacts_site_d1$q25[2],"-",
                               median_contacts_site_d1$q75[2],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  layout(annotations = title_urban_d1,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Urban")

fig1_hist_d1 <- subplot(hist_rural_d1, hist_urban_d1, 
                        nrows=2, shareX = TRUE) %>% 
  layout(title = 'Day 1 Contact Distributions')

fig1_hist_d1
# urban contacts
df_contact_urban <- df_contact %>%
  dplyr::filter(study_site == "Urban")


contacts_age <- df_contact_d2 %>%
  dplyr::group_by(rec_id, study_site, participant_age) %>%   
  # group by id and count number of df_contact_d2
  dplyr::summarize(num_contacts = n())
## `summarise()` has grouped output by 'rec_id', 'study_site'. You can override
## using the `.groups` argument.
# contacts by site
contacts_site_d2 <- df_contact_d2 %>%
  dplyr::group_by(rec_id, study_site) %>%
  dplyr::summarize(num_contacts = n())
## `summarise()` has grouped output by 'rec_id'. You can override using the
## `.groups` argument.
median_contacts_site_d2 <- contacts_site_d2 %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(q25 = quantile(num_contacts, probs = 0.25),
                   q50 = quantile(num_contacts, probs = 0.5),
                   q75 = quantile(num_contacts, probs = 0.75))

mean_contacts_site_d2 <- contacts_site_d2 %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(mean = round(mean(num_contacts),1))

ci_contacts_rural_d2 <- contacts_site_d2 %>%
  dplyr::filter(study_site == "Rural")
ci_contacts_rural_d2 <- lm(num_contacts ~ 1, ci_contacts_rural_d2)
ci_contacts_rural_d2 <- as.data.frame(round(confint(ci_contacts_rural_d2),1))

ci_contacts_urban_d2 <- contacts_site_d2 %>%
  dplyr::filter(study_site == "Urban")
ci_contacts_urban_d2 <- lm(num_contacts ~ 1, ci_contacts_urban_d2)
ci_contacts_urban_d2 <- as.data.frame(round(confint(ci_contacts_urban_d2),1))

# title
title_rural_d2 <- list(
  text = "Rural",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)

title_urban_d2 <- list(
  text = "Urban",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)
hist_rural_d2 <- contacts_site_d2 %>%
  dplyr::filter(study_site == "Rural") %>%
  plotly::plot_ly(x=~num_contacts, colors="#a6611a") %>%
  plotly::add_lines(y = range(0:100),
            x = mean_contacts_site_d2$mean[1],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:100),
            x = median_contacts_site_d2$q50[1],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  plotly::add_annotations(x = mean_contacts_site_d2$mean[1], 
                  y = 90,
                  text = paste(mean_contacts_site_d2$mean[1], " (95% CI ",
                               ci_contacts_rural_d2$`2.5 %`,"-",
                               ci_contacts_rural_d2$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site_d2$q50[1], 
                  y = 90,
                  text = paste(median_contacts_site_d2$q50[1], " (95% CI ",
                               median_contacts_site_d2$q25[1],"-",
                               median_contacts_site_d2$q75[1],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  plotly::layout(annotations = title_rural_d2,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Rural")
# hist_rural

hist_urban_d2 <- contacts_site_d2 %>%
  dplyr::filter(study_site == "Urban") %>%
  plotly::plot_ly(x=~num_contacts, colors="#018571") %>%
  add_lines(y = range(0:100),
            x = mean_contacts_site_d2$mean[2],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:100),
            x = median_contacts_site_d2$q50[2],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  add_annotations(x = mean_contacts_site_d2$mean[2], 
                  y = 90,
                  text = paste(mean_contacts_site_d2$mean[2], " (95% CI ",
                               ci_contacts_urban_d2$`2.5 %`,"-",
                               ci_contacts_urban_d2$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site_d2$q50[2], 
                  y = 90,
                  text = paste(median_contacts_site_d2$q50[2], " (95% CI ",
                               median_contacts_site_d2$q25[2],"-",
                               median_contacts_site_d2$q75[2],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  layout(annotations = title_urban_d2,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Urban")

fig1_hist_d2 <- subplot(hist_rural_d2, hist_urban_d2, 
                        nrows=2, shareX = TRUE) %>% 
  layout(title = 'Day 2 Contact Distributions')

fig1_hist_d2
# urban contacts
df_contact_urban <- df_contact %>%
  dplyr::filter(study_site == "Urban")

# contacts by site
contacts_site <- df_contact %>%
  dplyr::group_by(rec_id, study_site, study_day, age, participant_sex) %>%
  dplyr::summarize(num_contacts = n())
## `summarise()` has grouped output by 'rec_id', 'study_site', 'study_day', 'age'.
## You can override using the `.groups` argument.
median_contacts_site <- contacts_site %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(q25 = quantile(num_contacts, probs = 0.25),
                   q50 = quantile(num_contacts, probs = 0.5),
                   q75 = quantile(num_contacts, probs = 0.75))

mean_contacts_site <- contacts_site %>%
  dplyr::group_by(study_site) %>%
  dplyr::summarize(mean = round(mean(num_contacts),1))

ci_contacts_rural <- contacts_site %>%
  dplyr::filter(study_site == "Rural")
ci_contacts_rural <- lm(num_contacts ~ 1, ci_contacts_rural)
ci_contacts_rural <- as.data.frame(round(confint(ci_contacts_rural),1))

ci_contacts_urban <- contacts_site %>%
  dplyr::filter(study_site == "Urban")
ci_contacts_urban <- lm(num_contacts ~ 1, ci_contacts_urban)
ci_contacts_urban <- as.data.frame(round(confint(ci_contacts_urban),1))

# title
title_rural <- list(
  text = "Rural",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)

title_urban <- list(
  text = "Urban",
  size=14,
  # font = f,
  xref = "paper", yref = "paper",
  yanchor = "bottom", xanchor = "center",
  align = "center",
  x = 0.5, y = 1,
  showarrow = FALSE
)
hist_rural <- contacts_site %>%
  dplyr::filter(study_site == "Rural") %>%
  plotly::plot_ly(x=~num_contacts, colors="#a6611a") %>%
  plotly::add_lines(y = range(0:150),
            x = mean_contacts_site$mean[1],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:150),
            x = median_contacts_site$q50[1],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  plotly::add_annotations(x = mean_contacts_site$mean[1], 
                  y = 90,
                  text = paste(mean_contacts_site$mean[1], " (95% CI ",
                               ci_contacts_rural$`2.5 %`,"-",
                               ci_contacts_rural$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site$q50[1], 
                  y = 90,
                  text = paste(median_contacts_site$q50[1], " (95% CI ",
                               median_contacts_site$q25[1],"-",
                               median_contacts_site$q75[1],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  plotly::layout(annotations = title_rural,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Rural")
# hist_rural

hist_urban <- contacts_site %>%
  dplyr::filter(study_site == "Urban") %>%
  plotly::plot_ly(x=~num_contacts, colors="#018571") %>%
  add_lines(y = range(0:150),
            x = mean_contacts_site$mean[2],
            line = list(color = "black"),
            showlegend = FALSE) %>%
  add_lines(y = range(0:150),
            x = median_contacts_site$q50[2],
            line = list(color = "grey"),
            showlegend = FALSE) %>%
  add_annotations(x = mean_contacts_site$mean[2], 
                  y = 90,
                  text = paste(mean_contacts_site$mean[2], " (95% CI ",
                               ci_contacts_urban$`2.5 %`,"-",
                               ci_contacts_urban$`97.5 %`,")", sep=""), 
                  showarrow = F, xanchor = "left") %>%
  add_annotations(x = median_contacts_site$q50[2], 
                  y = 90,
                  text = paste(median_contacts_site$q50[2], " (95% CI ",
                               median_contacts_site$q25[2],"-",
                               median_contacts_site$q75[2],")", sep=""), 
                  showarrow = F, xanchor = "right") %>%
  layout(annotations = title_urban,
         xaxis = list(title = "Num of contacts")) %>%
  plotly::add_histogram(name="Urban")

fig1_hist <- subplot(hist_rural, hist_urban, 
                        nrows=2, shareX = TRUE) %>% 
  layout(title = 'Contact Distributions - Both Days')

fig1_hist

Number of Contacts Analysis

Questions: - Should we have separate urban and rural models? - Model should include random effects (lme4) for - Negative binomial mixed model not converging at all (no interaction) - Poisson MM is working fine with no interaction - Poisson MM NOT working when including interaction terms - Linear MM works fine with and without interaction

library(lme4)
## Loading required package: Matrix
require(MASS)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
# negbin_model <- glmer.nb(num_contacts ~ s_age + participant_sex +
#                          study_site + (1 | rec_id),
#                          offset = log(N),
#                          verbose = TRUE,
#                          data = df_contact %>% distinct())
# summary(negbin_model)

# nb_int_model <- glmer.nb(num_contacts ~ s_age + participant_sex +
#                          study_site + participant_sex*study_site +
#                          s_age*study_site + s_age*participant_sex+ 
#                          (1 | rec_id),
#                          offset = log(N),
#                          verbose = TRUE,
#                          data = df_contact %>% distinct())
# summary(nb_int_model)

df_contact$N <- sum(df_contact$num_contacts)
df_contact$s_age <- scale(as.numeric(df_contact$participant_age))

poisson_model <- glmer(num_contacts ~ s_age + participant_sex + 
                         study_site + (1 | rec_id),
                      offset = log(N),
                      data = df_contact %>% distinct(),
                      family = poisson(link = "log"))

summary(poisson_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: num_contacts ~ s_age + participant_sex + study_site + (1 | rec_id)
##    Data: df_contact %>% distinct()
##  Offset: log(N)
## 
##      AIC      BIC   logLik deviance df.resid 
##  87517.3  87556.4 -43753.6  87507.3    18454 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8453 -0.2591  0.0239  0.3518  1.3139 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 0.312    0.5586  
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                      Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)         -10.24995    0.02741 -373.887   <2e-16 ***
## s_age                 0.13898    0.01455    9.552   <2e-16 ***
## participant_sexMale   0.06213    0.03154    1.970   0.0489 *  
## study_siteUrban      -0.30971    0.03159   -9.804   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M
## s_age        0.069              
## prtcpnt_sxM -0.590  0.009       
## stdy_stUrbn -0.560 -0.055  0.000
poi_int_model <- glmer(num_contacts ~ s_age + participant_sex +
                         study_site + participant_sex*study_site +
                         s_age*study_site + s_age*participant_sex +
                         (1 | rec_id),
                      offset = log(N),
                      data = df_contact %>% distinct(),
                      family = poisson(link = "log"))
summary(poi_int_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## num_contacts ~ s_age + participant_sex + study_site + participant_sex *  
##     study_site + s_age * study_site + s_age * participant_sex +  
##     (1 | rec_id)
##    Data: df_contact %>% distinct()
##  Offset: log(N)
## 
##      AIC      BIC   logLik deviance df.resid 
##  87518.1  87580.7 -43751.1  87502.1    18451 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8457 -0.2590  0.0246  0.3502  1.3146 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 0.3111   0.5577  
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                                       Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                         -10.233229   0.031520 -324.659  < 2e-16 ***
## s_age                                 0.170134   0.024894    6.834 8.23e-12 ***
## participant_sexMale                   0.038088   0.044083    0.864   0.3876    
## study_siteUrban                      -0.343044   0.045185   -7.592 3.15e-14 ***
## participant_sexMale:study_siteUrban   0.055007   0.063171    0.871   0.3839    
## s_age:study_siteUrban                -0.060714   0.029054   -2.090   0.0366 *  
## s_age:participant_sexMale            -0.003907   0.029052   -0.134   0.8930    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M stdy_U p_M:_U s_g:_U
## s_age        0.070                                   
## prtcpnt_sxM -0.710 -0.014                            
## stdy_stUrbn -0.693 -0.037  0.492                     
## prtcpn_M:_U  0.494  0.001 -0.696 -0.713              
## s_g:stdy_sU -0.039 -0.552 -0.040  0.053  0.008       
## s_g:prtcp_M -0.042 -0.575  0.099  0.004 -0.053 -0.037
linear_model <- lmer(num_contacts ~ s_age + participant_sex +
                         study_site + (1 | rec_id),
                      offset = log(N),
                      data = df_contact %>% distinct())

summary(linear_model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: num_contacts ~ s_age + participant_sex + study_site + (1 | rec_id)
##    Data: df_contact %>% distinct()
##  Offset: log(N)
## 
## REML criterion at convergence: 89179.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.7125  -0.2591   0.0205   0.4393   2.9561 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  rec_id   (Intercept) 20.334   4.509   
##  Residual              5.577   2.362   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                      Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)           -3.3173     0.2160 1356.0778 -15.358  < 2e-16 ***
## s_age                  0.8792     0.1131 1383.4643   7.777 1.45e-14 ***
## participant_sexMale    0.3411     0.2484 1362.4466   1.373     0.17    
## study_siteUrban       -2.6351     0.2488 1362.2500 -10.592  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M
## s_age        0.091              
## prtcpnt_sxM -0.589  0.005       
## stdy_stUrbn -0.568 -0.055  0.005
lin_int_model <- lmer(num_contacts ~ s_age + participant_sex +
                        study_site + participant_sex*study_site +
                        s_age*study_site + s_age*participant_sex +
                        (1 | rec_id),
                      offset = log(N),
                      data = df_contact %>% distinct())

summary(lin_int_model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## num_contacts ~ s_age + participant_sex + study_site + participant_sex *  
##     study_site + s_age * study_site + s_age * participant_sex +  
##     (1 | rec_id)
##    Data: df_contact %>% distinct()
##  Offset: log(N)
## 
## REML criterion at convergence: 89171.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.7138  -0.2580   0.0214   0.4374   2.9565 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  rec_id   (Intercept) 20.232   4.498   
##  Residual              5.577   2.362   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                                       Estimate Std. Error         df t value
## (Intercept)                           -3.20015    0.24884 1351.41118 -12.860
## s_age                                  1.20955    0.19383 1379.70405   6.240
## participant_sexMale                    0.23656    0.34896 1349.86403   0.678
## study_siteUrban                       -2.87260    0.35489 1362.69733  -8.094
## participant_sexMale:study_siteUrban    0.30003    0.49702 1360.04831   0.604
## s_age:study_siteUrban                 -0.70700    0.22596 1381.52334  -3.129
## s_age:participant_sexMale              0.03264    0.22597 1381.59210   0.144
##                                     Pr(>|t|)    
## (Intercept)                          < 2e-16 ***
## s_age                               5.80e-10 ***
## participant_sexMale                  0.49796    
## study_siteUrban                     1.26e-15 ***
## participant_sexMale:study_siteUrban  0.54618    
## s_age:study_siteUrban                0.00179 ** 
## s_age:participant_sexMale            0.88516    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M stdy_U p_M:_U s_g:_U
## s_age        0.093                                   
## prtcpnt_sxM -0.709 -0.023                            
## stdy_stUrbn -0.699 -0.045  0.493                     
## prtcpn_M:_U  0.496  0.001 -0.699 -0.710              
## s_g:stdy_sU -0.052 -0.560 -0.041  0.072  0.005       
## s_g:prtcp_M -0.053 -0.572  0.116  0.002 -0.054 -0.031

Outlier analysis using mean as cut off

contact_summaries <- df_contact %>% 
  distinct() %>%
  ungroup() %>%
  dplyr::summarize(q25 = quantile(num_contacts, probs = 0.25),
                   q50 = quantile(num_contacts, probs = 0.5),
                   q75 = quantile(num_contacts, probs = 0.75),
                   q90 = quantile(num_contacts, probs = 0.90),
                   mean = mean(num_contacts))

df_contact$mean_outlier <- if_else(df_contact$num_contacts > contact_summaries$mean, 1, 0)
df_contact$q75_outlier <- if_else(df_contact$num_contacts > contact_summaries$q75, 1, 0)

mean_outlier_model <- glmer(mean_outlier ~ s_age + participant_sex +
                         study_site + (1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))

summary(mean_outlier_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: mean_outlier ~ s_age + participant_sex + study_site + (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   7027.3   7066.5  -3508.7   7017.3    18454 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.04757 -0.00489 -0.00256  0.04458  1.00437 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 308.1    17.55   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -10.5386     0.4544 -23.190  < 2e-16 ***
## s_age                 0.1226     0.2185   0.561  0.57452    
## participant_sexMale   0.2063     0.4826   0.428  0.66898    
## study_siteUrban      -1.4321     0.5205  -2.751  0.00593 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M
## s_age        0.049              
## prtcpnt_sxM -0.565  0.038       
## stdy_stUrbn -0.375 -0.124 -0.014
mean_out_int_model <- glmer(mean_outlier ~ s_age + participant_sex +
                        study_site + participant_sex*study_site +
                        s_age*study_site + s_age*participant_sex +
                        (1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 5.90394 (tol = 0.002, component 1)
summary(mean_out_int_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## mean_outlier ~ s_age + participant_sex + study_site + participant_sex *  
##     study_site + s_age * study_site + s_age * participant_sex +  
##     (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   7140.2   7202.8  -3562.1   7124.2    18451 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.99574 -0.01248 -0.00599  0.07085  1.01126 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 101.3    10.07   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          -8.8257     0.4584 -19.253  < 2e-16 ***
## s_age                                 0.8870     0.3236   2.741  0.00612 ** 
## participant_sexMale                   0.7274     0.5261   1.383  0.16677    
## study_siteUrban                      -1.2865     0.6079  -2.116  0.03434 *  
## participant_sexMale:study_siteUrban  -0.5008     0.8245  -0.607  0.54363    
## s_age:study_siteUrban                -0.7795     0.3692  -2.111  0.03476 *  
## s_age:participant_sexMale            -0.6835     0.3729  -1.833  0.06681 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M stdy_U p_M:_U s_g:_U
## s_age       -0.172                                   
## prtcpnt_sxM -0.680  0.159                            
## stdy_stUrbn -0.622  0.086  0.497                     
## prtcpn_M:_U  0.455 -0.137 -0.641 -0.747              
## s_g:stdy_sU  0.076 -0.466 -0.140 -0.085  0.226       
## s_g:prtcp_M  0.158 -0.686 -0.003 -0.051  0.055  0.003
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 5.90394 (tol = 0.002, component 1)

Outlier analysis with 75th %ile as cut off

q75_outlier_model <- glmer(q75_outlier ~ s_age + participant_sex +
                         study_site + (1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))

summary(q75_outlier_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: q75_outlier ~ s_age + participant_sex + study_site + (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   5878.4   5917.5  -2934.2   5868.4    18454 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.03578 -0.00396 -0.00221 -0.00181  1.04868 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 237.1    15.4    
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -11.01679    0.50567 -21.787   <2e-16 ***
## s_age                 0.13066    0.25864   0.505   0.6134    
## participant_sexMale   0.08093    0.55152   0.147   0.8833    
## study_siteUrban      -1.35364    0.63378  -2.136   0.0327 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M
## s_age       -0.003              
## prtcpnt_sxM -0.570  0.039       
## stdy_stUrbn -0.317 -0.079 -0.017
q75_out_int_model <- glmer(q75_outlier ~ s_age + participant_sex +
                        study_site + participant_sex*study_site +
                        s_age*study_site + s_age*participant_sex +
                        (1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))

summary(q75_out_int_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## q75_outlier ~ s_age + participant_sex + study_site + participant_sex *  
##     study_site + s_age * study_site + s_age * participant_sex +  
##     (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   5884.3   5946.8  -2934.1   5868.3    18451 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.03562 -0.00393 -0.00213 -0.00194  1.04870 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 236.4    15.37   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -11.02212    0.53289 -20.684   <2e-16 ***
## s_age                                 0.13742    0.39408   0.349    0.727    
## participant_sexMale                   0.10422    0.63852   0.163    0.870    
## study_siteUrban                      -1.29950    0.92039  -1.412    0.158    
## participant_sexMale:study_siteUrban  -0.07876    1.26795  -0.062    0.950    
## s_age:study_siteUrban                -0.24016    0.59698  -0.402    0.687    
## s_age:participant_sexMale             0.09842    0.51613   0.191    0.849    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M stdy_U p_M:_U s_g:_U
## s_age       -0.031                                   
## prtcpnt_sxM -0.626  0.031                            
## stdy_stUrbn -0.442  0.035  0.365                     
## prtcpn_M:_U  0.315 -0.019 -0.504 -0.726              
## s_g:stdy_sU  0.006 -0.348 -0.011  0.026 -0.036       
## s_g:prtcp_M  0.030 -0.655 -0.019 -0.035  0.015 -0.045

Outlier analysis with 90th %ile as cut off

df_contact$q90_outlier <- if_else(df_contact$num_contacts > contact_summaries$q90, 1, 0)

q90_outlier_model <- glmer(q90_outlier ~ s_age + participant_sex +
                          study_site +(1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))

summary(q90_outlier_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: q90_outlier ~ s_age + participant_sex + study_site + (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   2588.6   2627.7  -1289.3   2578.6    18454 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.03333 -0.00193 -0.00146 -0.00081  1.00288 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 249.1    15.78   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -12.47901    0.81198 -15.369   <2e-16 ***
## s_age                 0.26310    0.46740   0.563    0.574    
## participant_sexMale  -0.07088    0.93129  -0.076    0.939    
## study_siteUrban      -1.66910    1.30149  -1.282    0.200    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M
## s_age       -0.115              
## prtcpnt_sxM -0.581  0.033       
## stdy_stUrbn -0.228 -0.044 -0.009
q90_out_int_model <- glmer(q90_outlier ~ s_age + participant_sex +
                        study_site + participant_sex*study_site +
                        s_age*study_site + s_age*participant_sex +
                        (1 | rec_id),
                      data = df_contact %>% distinct(),
                      family = binomial(link = "logit"))

summary(q90_out_int_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## q90_outlier ~ s_age + participant_sex + study_site + participant_sex *  
##     study_site + s_age * study_site + s_age * participant_sex +  
##     (1 | rec_id)
##    Data: df_contact %>% distinct()
## 
##      AIC      BIC   logLik deviance df.resid 
##   2594.4   2657.0  -1289.2   2578.4    18451 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.03323 -0.00192 -0.00135 -0.00080  1.00287 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  rec_id (Intercept) 248.1    15.75   
## Number of obs: 18459, groups:  rec_id, 1363
## 
## Fixed effects:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -12.48407    0.84496 -14.775   <2e-16 ***
## s_age                                 0.35327    0.68893   0.513    0.608    
## participant_sexMale                  -0.07991    1.03104  -0.078    0.938    
## study_siteUrban                      -1.70445    1.92519  -0.885    0.376    
## participant_sexMale:study_siteUrban   0.15749    2.62135   0.060    0.952    
## s_age:study_siteUrban                -0.55342    1.26942  -0.436    0.663    
## s_age:participant_sexMale            -0.01178    0.93614  -0.013    0.990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) s_age  prtc_M stdy_U p_M:_U s_g:_U
## s_age       -0.235                                   
## prtcpnt_sxM -0.621  0.180                            
## stdy_stUrbn -0.338  0.137  0.280                     
## prtcpn_M:_U  0.253 -0.111 -0.404 -0.729              
## s_g:stdy_sU  0.062 -0.263 -0.010  0.080 -0.016       
## s_g:prtcp_M  0.156 -0.669 -0.211 -0.120  0.144 -0.055